In [1]:
import os
import sys
import numpy as np
import matplotlib.pyplot as plt
from matplotlib import colors
from statsmodels.stats.multitest import multipletests

from scipy.ndimage.filters import gaussian_filter as gf
In [2]:
from scipy.stats import truncnorm
from scipy.optimize import curve_fit
In [3]:
from tools import *
In [4]:
%matplotlib inline
plt.rcParams['legend.numpoints'] = 1
plt.rcParams['figure.dpi'] = 150
#plt.rcParams['figure.figsize'] = 5, 4
plt.rcParams['font.size'] = 8
In [5]:
resolution = 10000
n_bins = 791
In [6]:
input_fo='/home/lv/working_data/Streptomyces/18_D1186_III/2020_09/SCN_good_corrected/'
#os.listdir(input_fo)
In [7]:
batches=['L216','L263','LBM5']
counts={t:np.genfromtxt(input_fo+'%s_2M_corr.txt'%t) for t in batches}

Compartments location

In [8]:
comp_bounds=[128,561] #from np.argwhere(frontier_index[batch][1]>0.045), see below

Frontier maps

In [9]:
gaussian_width=1 #to mitigate noise in generate_frontiers
max_s=60 #to mitigate noise in compute_frontier_indexes
In [10]:
domain_fo='figures/domains_gw=%.1f_max_s=%d/'%(gaussian_width,max_s)
if not os.path.exists(domain_fo): os.mkdir(domain_fo)
In [11]:
#mininum positive values to compute pseudo_counts
min_positive_counts={t:np.min(counts[t][counts[t]>0]) for t in batches}
max_counts={t:np.max(counts[t]) for t in batches} 
In [12]:
#pseudo_counts={t:min_positive_counts[t]/2 for t in batches}
pseudo_counts={t:max_counts[t]/20 for t in batches}
In [13]:
frontiers_matrix = {t:generate_frontiers(np.log(counts[t]+pseudo_counts[t]),w=gaussian_width) for t in batches}
In [14]:
fig, axes = plt.subplots(figsize=(6, 12), ncols=2, nrows=3, tight_layout=True)

for i,t in enumerate(batches):
    axes[i][0].imshow(
        counts[t],
        norm=colors.LogNorm(),
        vmin=0.0005, vmax=0.1, cmap='bone_r')
    axes[i][0].set_title('HiC heat map - %s'%t)

    axes[i][1].imshow(
        frontiers_matrix[t][0],
        cmap="Greys",vmax=0.15);#,
#        vmin=0,
#        vmax=0.001);
    axes[i][1].set_title('Frontier signal - %s'%t);

fig.savefig('results/frontier_maps.pdf', transparent=True)
/home/lv/anaconda3/lib/python3.7/site-packages/matplotlib/figure.py:2366: UserWarning: This figure includes Axes that are not compatible with tight_layout, so results might be incorrect.
  warnings.warn("This figure includes Axes that are not compatible "

Frontier indexes

We circularize the chromosome. In effect, it means:

  • it may detect less efficienly frontiers at the edges of the chromosome
  • we need to remove the first and last bin in the analysis of peaks
In [15]:
frontier_index = {t:compute_frontier_indexes(frontiers_matrix[t],max_s=max_s) for t in batches}
In [16]:
batch_axes=['L263','LBM5']
fig, axes = plt.subplots(figsize=(8, 6), nrows=2, tight_layout=True)

loci=[i for i in range(len(frontier_index[batch_axes[0]][0]))]

for iplot in [0,1]:
    for i, (c, lab) in enumerate(zip(['C1','C2'], ['upstream', 'downstream'])):
        axes[iplot].plot(loci,[frontier_index[batch_axes[iplot]][i][l] for l in loci], '%s+-'%c,label=lab);
    axes[iplot].set_title(batch_axes[iplot])
    axes[iplot].set_ylim([0,7])
#for i, (c, lab) in enumerate(zip(['C1','C2'], ['upstream', 'downstream'])):
#    axes[1].plot(loci,[frontier_index[batch][i][l] for l in loci], '%s+-'%c,label=lab);
#axes[1].set_xlim([0,200])

#axes[0].set_title(batch)

axes[1].set_xlabel('locus=(HiC) matrix index');
for i in [0, 1]:    
    axes[i].set_ylabel('Frontier index');
    axes[i].legend(fontsize="medium");
    
fig.savefig('results/profiles.pdf', transparent=True)
/home/lv/anaconda3/lib/python3.7/site-packages/matplotlib/figure.py:2366: UserWarning: This figure includes Axes that are not compatible with tight_layout, so results might be incorrect.
  warnings.warn("This figure includes Axes that are not compatible "

Identification and analysis of peaks

In [17]:
#identification of all peaks with their value
peaks = {t:[find_peaks(frontier_index[t][i]) for i in range(2)] for t in batches}
In [18]:
#extract peaks values depending on compartments to perform stat analysis
peak_values={}
peak_values['core']={t:np.array([peaks[t][0][1][ip] for ip, p in enumerate(peaks[t][0][0]) if comp_bounds[0] <= p <= comp_bounds[1]]+
                                   [peaks[t][1][1][ip] for ip, p in enumerate(peaks[t][1][0]) if comp_bounds[0] <= p <= comp_bounds[1]])
                     for t in batches}

peak_values['borders']={t:np.array([peaks[t][0][1][ip] for ip, p in enumerate(peaks[t][0][0]) if 0<p<comp_bounds[0] or p > comp_bounds[1]]+
                                   [peaks[t][1][1][ip] for ip, p in enumerate(peaks[t][1][0]) if 0<p<comp_bounds[0] or p > comp_bounds[1]])
                  for t in batches}
In [19]:
#median (med) and median absolute deviaiton (mad)
#rem: compare to std, mad is a more robust estimate of the "variance of the null model"
med_peaks={comp:{t:np.median(peak_values[comp][t]) for t in batches} for comp in ['core','borders']}
mad_peaks={comp:{t:np.median(np.abs(peak_values[comp][t]-med_peaks[comp][t])) for t in batches} for comp in ['core','borders']}
In [20]:
rho=1.48
k_sigma=2
In [21]:
thresh={comp:{t:med_peaks[comp][t]+k_sigma*rho*mad_peaks[comp][t] for t in batches} for comp in ['core','borders']}
In [22]:
plt.figure(figsize=(6,4),tight_layout=True)

bins=np.arange(0,6,0.03)
comp='borders'
for it,batch in enumerate(batches):
    plt.subplot(2,2,it+1)
    
    plt.hist(peak_values[comp][batch],bins=bins,label=comp+'-'+batch);
    plt.axvline(med_peaks[comp][batch],color='k',linestyle='dashed')
    plt.axvline(thresh[comp][batch],color='r',linestyle='dashed')
    plt.legend(loc=1)
/home/lv/anaconda3/lib/python3.7/site-packages/matplotlib/figure.py:2366: UserWarning: This figure includes Axes that are not compatible with tight_layout, so results might be incorrect.
  warnings.warn("This figure includes Axes that are not compatible "
In [23]:
plt.figure(figsize=(6,4),tight_layout=True)

bins=np.arange(0,6,0.03)
comp='core'
for it,batch in enumerate(batches):
    plt.subplot(2,2,it+1)
    
    plt.hist(peak_values[comp][batch],bins=bins,label=comp+'-'+batch);
    plt.axvline(med_peaks[comp][batch],color='k',linestyle='dashed')
    plt.axvline(thresh[comp][batch],color='r',linestyle='dashed')
    plt.legend(loc=1)
/home/lv/anaconda3/lib/python3.7/site-packages/matplotlib/figure.py:2366: UserWarning: This figure includes Axes that are not compatible with tight_layout, so results might be incorrect.
  warnings.warn("This figure includes Axes that are not compatible "

Extract of significsnt peaks

Rem: here, significant = above an arbitrarly, yet reasonable thresholds

In [24]:
sign_peaks={} # [batch][i_FI][0]=loci, [1]=values
for batch in batches:
    sign_peaks[batch]=[[],[]]
    for i_FI in [0,1]:
        sign_peaks[batch][i_FI]=[[peaks[batch][i_FI][0][i] for i in range(len(peaks[batch][i_FI][0]))
                                  if ((2<=peaks[batch][i_FI][0][i]<=comp_bounds[0] and peaks[batch][i_FI][1][i]>=thresh['borders'][batch]) or
                                      (comp_bounds[0]<peaks[batch][i_FI][0][i]<=comp_bounds[1] and peaks[batch][i_FI][1][i]>=thresh['core'][batch]) or
                                      (comp_bounds[1]<peaks[batch][i_FI][0][i]<=n_bins-2 and peaks[batch][i_FI][1][i]>=thresh['borders'][batch]))],                                 
                                 [peaks[batch][i_FI][1][i] for i in range(len(peaks[batch][i_FI][0]))
                                  if ((2<=peaks[batch][i_FI][0][i]<=comp_bounds[0] and peaks[batch][i_FI][1][i]>=thresh['borders'][batch]) or
                                      (comp_bounds[0]<peaks[batch][i_FI][0][i]<=comp_bounds[1] and peaks[batch][i_FI][1][i]>=thresh['core'][batch]) or
                                      (comp_bounds[1]<peaks[batch][i_FI][0][i]<=n_bins-2 and peaks[batch][i_FI][1][i]>=thresh['borders'][batch]))]
                                ]
In [25]:
for batch in batches:
    print("%s"%batch)
    print("Number of upstream peaks:\n\tborders:%d\tcore:%d"%(len([p for p in sign_peaks[batch][0][0] if p <= comp_bounds[0] or p>=comp_bounds[1]]),
                                                   len([p for p in sign_peaks[batch][0][0] if p > comp_bounds[0] and p<comp_bounds[1]])))
    print("Number of downstream peaks:\n\tborders:%d\tcore:%d"%(len([p for p in sign_peaks[batch][1][0] if p <= comp_bounds[0] or p>=comp_bounds[1]]),
                                                   len([p for p in sign_peaks[batch][1][0] if p > comp_bounds[0] and p<comp_bounds[1]])))
    print('\n')
L216
Number of upstream peaks:
	borders:5	core:11
Number of downstream peaks:
	borders:4	core:9


L263
Number of upstream peaks:
	borders:9	core:12
Number of downstream peaks:
	borders:8	core:12


LBM5
Number of upstream peaks:
	borders:8	core:10
Number of downstream peaks:
	borders:9	core:7


In [26]:
batch='L263'
fig, axes = plt.subplots(figsize=(9, 6), ncols=3, nrows=2,tight_layout=True)

for irow in range(2):
    axes[irow, 0].matshow(
        counts[batch],
        norm=colors.LogNorm(),
        vmin=0.0005, vmax=0.1, cmap='bone_r')

    axes[irow, 1].plot(frontier_index[batch][0], "C1-",label="$FI_{up}$")
    axes[irow, 1].plot(frontier_index[batch][1], "C2-",label="$FI_{down}$")

    axes[irow, 1].axhline(thresh['borders'][batch],color='k',linestyle='dashed',label='thresh-borders')
    axes[irow, 1].axhline(thresh['core'][batch],color='k',linestyle='dotted',label='thresh-core')
    
    axes[irow, 2].matshow(frontiers_matrix[batch][0],
                cmap=plt.get_cmap('Greys'))#, vmin=0, vmax=0.001);
    [axes[irow, 2].plot([i, len(counts[batch])], [i, i], color="C1", alpha=0.7)
     for i in sign_peaks[batch][0][0]];
    [axes[irow, 2].plot([i, i], [i,len(counts[batch])], color="C1", alpha=0.7)
     for i in sign_peaks[batch][0][0]];

    [axes[irow, 2].plot([0, i], [i, i], color="C2", alpha=0.7)
     for i in sign_peaks[batch][1][0]];
    [axes[irow, 2].plot([i, i], [0, i], color="C2", alpha=0.7)
     for i in sign_peaks[batch][1][0]];
    
    # Set titles on the plots of the first row
    if irow == 0:
        axes[irow, 0].set_title('HiC', pad=0.5)
        axes[irow, 1].set_title('Frontier index (FI)')
        axes[irow, 2].set_title('Frontier matrix and FI peaks',pad=0.5)
    
    if irow == 0:
        for j in [0,2]:
            axes[irow, j].set_xlim([0,n_bins/2])
            axes[irow][j].set_ylim([n_bins/2,0])
            axes[irow, j].xaxis.set_ticks_position('bottom')
        axes[irow, 1].set_xlim([0, n_bins/2])
        axes[irow, 1].set_ylim([0, 6])
        axes[irow, 1].legend(loc=2,fontsize='large')
    else:
        for j in [0, 2]:
            axes[irow, j].set_xlim([n_bins/2, n_bins])
            axes[irow, j].set_ylim([n_bins, n_bins/2])
            axes[irow, j].xaxis.set_ticks_position('bottom')
        axes[irow, 1].set_xlim([n_bins/2, n_bins])
        axes[irow, 1].set_ylim([0, 6])
        axes[irow, 1].legend(loc=1,fontsize='large')
        
fig.savefig('results/example_frontier_peaks_L263.pdf')
/home/lv/anaconda3/lib/python3.7/site-packages/matplotlib/figure.py:2366: UserWarning: This figure includes Axes that are not compatible with tight_layout, so results might be incorrect.
  warnings.warn("This figure includes Axes that are not compatible "
In [27]:
batch='LBM5'
fig, axes = plt.subplots(figsize=(9, 6), ncols=3, nrows=2,tight_layout=True)

for irow in range(2):
    axes[irow, 0].matshow(
        counts[batch],
        norm=colors.LogNorm(),
        vmin=0.0005, vmax=0.1, cmap='bone_r')

    axes[irow, 1].plot(frontier_index[batch][0], "C1-",label="$FI_{up}$")
    axes[irow, 1].plot(frontier_index[batch][1], "C2-",label="$FI_{down}$")

    if irow==0:
        axes[irow, 1].axhline(thresh['borders'][batch],color='k',linestyle='dashed')
    else:
        axes[irow, 1].axhline(thresh['core'][batch],color='k',linestyle='dashed')
    
    axes[irow, 2].matshow(frontiers_matrix[batch][0],
                cmap=plt.get_cmap('Greys'))#, vmin=0, vmax=0.001);
    [axes[irow, 2].plot([i, len(counts[batch])], [i, i], color="C1", alpha=0.7)
     for i in sign_peaks[batch][0][0]];
    [axes[irow, 2].plot([i, i], [i,len(counts[batch])], color="C1", alpha=0.7)
     for i in sign_peaks[batch][0][0]];

    [axes[irow, 2].plot([0, i], [i, i], color="C2", alpha=0.7)
     for i in sign_peaks[batch][1][0]];
    [axes[irow, 2].plot([i, i], [0, i], color="C2", alpha=0.7)
     for i in sign_peaks[batch][1][0]];
    
    # Set titles on the plots of the first row
    if irow == 0:
        axes[irow, 0].set_title('HiC', pad=0.5)
        axes[irow, 1].set_title('Frontier index (FI)')
        axes[irow, 2].set_title('Frontier matrix and FI peaks',pad=0.5)
    
    if irow == 0:
        for j in [0,2]:
            axes[irow, j].set_xlim([0,n_bins/2])
            axes[irow][j].set_ylim([n_bins/2,0])
            axes[irow, j].xaxis.set_ticks_position('bottom')
        axes[irow, 1].set_xlim([0, n_bins/2])
        axes[irow, 1].set_ylim([0, 6])
        axes[irow, 1].legend(loc=2,fontsize='large')
    else:
        for j in [0, 2]:
            axes[irow, j].set_xlim([n_bins/2, n_bins])
            axes[irow, j].set_ylim([n_bins, n_bins/2])
            axes[irow, j].xaxis.set_ticks_position('bottom')
        axes[irow, 1].set_xlim([n_bins/2, n_bins])
        axes[irow, 1].set_ylim([0, 6])
        axes[irow, 1].legend(loc=1,fontsize='large')
        
fig.savefig('results/example_frontier_peaks_LBM5.pdf')
/home/lv/anaconda3/lib/python3.7/site-packages/matplotlib/figure.py:2366: UserWarning: This figure includes Axes that are not compatible with tight_layout, so results might be incorrect.
  warnings.warn("This figure includes Axes that are not compatible "

Output files

In [28]:
data_fo='results/gw=%.1f_max_s=%d/'%(gaussian_width,max_s)
if not os.path.exists(data_fo): os.mkdir(data_fo)
In [29]:
for batch in batches:
    #upstream
    with open(data_fo+'%s_upstream_peaks.txt'%batch,'w') as out:
#        for i in sorted([i for i in range(len(sign_peaks[batch][0][0]))],key=lambda i:-sign_peaks[batch][0][1][i]):
        for i in range(len(sign_peaks[batch][0][0])): out.write("%d\t%.5f\n"%(sign_peaks[batch][0][0][i]+1, sign_peaks[batch][0][1][i]))
      
    #downstream
    with open(data_fo+'%s_downstream_peaks.txt'%batch,'w') as out:
#        for i in sorted([i for i in range(len(sign_peaks[batch][1][0]))],key=lambda i:-sign_peaks[batch][1][1][i]):
        for i in range(len(sign_peaks[batch][1][0])): out.write("%d\t%.5f\n"%(sign_peaks[batch][1][0][i]+1, sign_peaks[batch][1][1][i]))
In [ ]:
 
In [ ]: